ICGC-CLL cohort with RNAseq data (EGAS00001000374)
Pre-processing data
Download ICGC dataset from EMBL server (The raw sequencing, genomic and outcome data can be queried from ICGC portatl: https://icgc.org/icgc/cgp/64/530/826)
icgcPath <- file.path(tempdir(),"rnaICGC.RData")
if(!file.exists(icgcPath)) {
download.file("https://www.huber.embl.de/users/jlu/data/rnaICGC.RData",
icgcPath)
load(icgcPath)
} else {
load(icgcPath)
}
Load and process ICGC expression dataset
rnaExt.vst <- varianceStabilizingTransformation(rnaExternal)
#Adjusted for potential batch effect
exprMat <- assay(rnaExt.vst)
batch <- as.factor(rnaExt.vst$raw_data_accession)
exprMat.combat <- sva::ComBat(exprMat, batch = batch)
## Found 426 genes with uniform expression within a single batch (all zeros); these will not be adjusted for batch.
assay(rnaExt.vst) <- exprMat.combat
Load and process internal RNAseq dataset
rna$factor <- facTab[match(colnames(rna), facTab$sample),]$value
rna <- rna[,!is.na(rna$factor)] #use samples with LF4 values
rna.vst <- varianceStabilizingTransformation(rna)
Filter out low variant genes in both datasets
exprMat <- assay(rna.vst)
exprMat.ext <- assay(rnaExt.vst)
#icgc dataset
sds <- genefilter::rowSds(exprMat.ext)
exprMat.ext <- exprMat.ext[order(sds,decreasing = TRUE)[1:10000],]
#internal dataset
sds <- genefilter::rowSds(exprMat)
exprMat <- exprMat[order(sds,decreasing = TRUE)[1:10000],]
#subset to keep common genes
overGene <- intersect(rownames(exprMat), rownames(exprMat.ext))
exprMat <- exprMat[overGene,]
exprMat.ext <- exprMat.ext[overGene,]
Remove highly correlated gene in training data
X <- mscale(exprMat)
#remove highly correlated genes
reRes <- removeCorrelated(t(X), cutoff = 0.9, method = "pearson", record = FALSE)
X <- reRes$reduced
Test assocation with clincial outcome
Prepare outcome table based on patient annotations
survT <- colData(rnaExternal) %>% data.frame(stringsAsFactors = FALSE) %>%
rownames_to_column("sampleID") %>%
mutate(predLF = y.pred[sampleID]) %>%
dplyr::select(sampleID, treatedAfter, TTT, predLF, OS, died) %>%
dplyr::rename(patientID = sampleID) %>%
filter(!(is.na(TTT) & is.na(OS))) #remove samples with none outcome avaialble
#how many samples
nrow(survT)
## [1] 249
Prepare risk factor table
riskTab <- colData(rnaExternal) %>% data.frame(stringsAsFactors = FALSE) %>%
rownames_to_column("sampleID") %>%
dplyr::select(sampleID, IGHV, TP53, del17p, age, sex, SF3B1, NOTCH1) %>%
mutate(`TP53.del17p` = TP53 | del17p, age = age/10) %>%
mutate_if(is.logical, as.numeric) %>%
select(-TP53, -del17p) %>%
dplyr::rename(patientID = sampleID) %>%
mutate(LF4 = y.pred[patientID], IGHV =as.factor(IGHV))
outTab <- left_join(riskTab, survT, by = "patientID")
#write_csv2(outTab, "sourceData_2cdef.csv")
#write_csv2(survT, "sourceData_2b_ICGC")
Univariate test
Overall survival
#calculate p value for cox regression
pOS <- comSurv(survT$predLF, survT$OS, survT$died)
kmOS <- km(survT$predLF, survT$OS, survT$died, "Overall survival (ICGC-CLL)",
stat = "maxstat",pval = pOS$p, showTable = TRUE)
kmOS

Time to treatment
pTTT <- comSurv(survT$predLF, survT$TTT, survT$treatedAfter)
kmTTT <- km(survT$predLF, survT$TTT, survT$treatedAfter, "Time to treatment (ICGC-CLL)", stat = "maxstat", pval = pTTT$p, showTable = TRUE)
kmTTT

Prepare a summary table
sumOutcome <- bind_rows(mutate(pTTT, outcome = "TTT"),mutate(pOS,outcome = "OS")) %>% mutate(cohort = "ICGC-CLL", n=nrow(survT))
Combine IGHV and CLL-PD
KM plot for subgroup defined by IGHV status and median latent factor values
groupTab <- survT %>% mutate(IGHV = rnaExternal[,patientID]$IGHV) %>%
filter(!is.na(IGHV)) %>%
mutate(group = ifelse(predLF > median(predLF),"highPD","lowPD")) %>%
mutate(subgroup = paste0(IGHV,"_",group))
groupList <- list()
# TTT
groupList[["TTT"]] <- km(groupTab$subgroup, groupTab$TTT, groupTab$treatedAfter, "Time to treatment (ICGC-CLL)", stat = "binary", showP = TRUE, showTable = TRUE, yLabelAdjust = -5)
# OS
groupList[["OS"]] <- km(groupTab$subgroup, groupTab$OS, groupTab$died, "Overall survival (ICGC-CLL)", stat = "binary", showP = TRUE, showTable = TRUE, yLabelAdjust = -5)
grid.arrange(grobs = groupList, ncol = 2)

Multivariate test
OS
surv1 <- runCox(survT, dplyr::rename(riskTab, CLLPD= LF4), "OS", "died")
summary(surv1)
## Call:
## coxph(formula = Surv(time, endpoint) ~ ., data = testTab)
##
## n= 247, number of events= 81
## (6 observations deleted due to missingness)
##
## coef exp(coef) se(coef) z Pr(>|z|)
## IGHVU 0.988694 2.687722 0.238190 4.151 3.31e-05 ***
## age 0.716112 2.046461 0.125042 5.727 1.02e-08 ***
## sexm 0.335878 1.399168 0.263964 1.272 0.2032
## SF3B1 0.215390 1.240346 0.375213 0.574 0.5659
## NOTCH1 -0.003541 0.996465 0.310940 -0.011 0.9909
## TP53.del17p 0.196096 1.216643 0.444733 0.441 0.6593
## CLLPD 0.644055 1.904186 0.258668 2.490 0.0128 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## exp(coef) exp(-coef) lower .95 upper .95
## IGHVU 2.6877 0.3721 1.6851 4.287
## age 2.0465 0.4886 1.6016 2.615
## sexm 1.3992 0.7147 0.8340 2.347
## SF3B1 1.2403 0.8062 0.5945 2.588
## NOTCH1 0.9965 1.0035 0.5417 1.833
## TP53.del17p 1.2166 0.8219 0.5089 2.909
## CLLPD 1.9042 0.5252 1.1469 3.161
##
## Concordance= 0.781 (se = 0.027 )
## Likelihood ratio test= 71.14 on 7 df, p=9e-13
## Wald test = 67.1 on 7 df, p=6e-12
## Score (logrank) test = 70.23 on 7 df, p=1e-12
haOS <- plotHazard(surv1, title = "Overall survival (ICGC-CLL)") +
scale_y_log10(limits = c(0.3,8))
haOS

Time to treatment
surv1 <- runCox(survT, dplyr::rename(riskTab, CLLPD = LF4), "TTT", "treatedAfter")
summary(surv1)
## Call:
## coxph(formula = Surv(time, endpoint) ~ ., data = testTab)
##
## n= 233, number of events= 127
## (20 observations deleted due to missingness)
##
## coef exp(coef) se(coef) z Pr(>|z|)
## IGHVU 1.41100 4.10006 0.19162 7.364 1.79e-13 ***
## age -0.33373 0.71625 0.07811 -4.273 1.93e-05 ***
## sexm 0.15982 1.17331 0.19901 0.803 0.42191
## SF3B1 0.77086 2.16162 0.27490 2.804 0.00504 **
## NOTCH1 0.32928 1.38997 0.25965 1.268 0.20474
## TP53.del17p 0.05686 1.05850 0.35661 0.159 0.87332
## CLLPD 0.56004 1.75074 0.19324 2.898 0.00375 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## exp(coef) exp(-coef) lower .95 upper .95
## IGHVU 4.1001 0.2439 2.8163 5.9689
## age 0.7162 1.3962 0.6146 0.8347
## sexm 1.1733 0.8523 0.7944 1.7330
## SF3B1 2.1616 0.4626 1.2612 3.7049
## NOTCH1 1.3900 0.7194 0.8356 2.3122
## TP53.del17p 1.0585 0.9447 0.5262 2.1293
## CLLPD 1.7507 0.5712 1.1988 2.5568
##
## Concordance= 0.738 (se = 0.02 )
## Likelihood ratio test= 98.86 on 7 df, p=<2e-16
## Wald test = 99.28 on 7 df, p=<2e-16
## Score (logrank) test = 115.1 on 7 df, p=<2e-16
haTTT <- plotHazard(surv1, title = "Time to treatment (ICGC-CLL)") +
scale_y_log10(limits = c(0.3,8))
haTTT

Correlation between predicted CLL-PD and demographics
Age
plotTab <- riskTab %>%
mutate(C1C2 = paste0("C",rnaExternal[,match(patientID, colnames(rnaExternal))]$`C1C2`),
sex = ifelse(is.na(sex),NA, ifelse(sex =="f" , "Female","Male")))
corRes <- cor.test(plotTab$age, plotTab$LF4)
pval <- formatNum(corRes$p.value, digits = 1, format = "e")
annoN <- sprintf("N = %s", nrow(filter(plotTab,!is.na(age))))
annoP <- bquote(italic("P")~"="~.(pval))
annoCoef <- sprintf("coefficient = %1.2f",corRes$estimate)
plotAge <- ggplot(plotTab, aes(x = age, y = LF4)) +
geom_point(color =colList[3]) +
geom_smooth(method = "lm", se=FALSE, color = "grey50", linetype ="dashed" ) +
annotate("text", x = max(plotTab$age), y = Inf, label = annoN,
hjust=1, vjust =1.5, size = 5, parse = FALSE, col= colList[1]) +
annotate("text", x = max(plotTab$age), y = Inf, label = annoP,
hjust=1, vjust =3.5, size = 5, parse = FALSE, col= colList[1]) +
annotate("text", x = max(plotTab$age), y = Inf, label = annoCoef,
hjust=1, vjust =5.5, size = 5, parse = FALSE, col= colList[1]) +
ylab("predicted CLL-PD") + xlab("Age (years)") +
theme_full
plotAge

Sex
corRes <- t.test(LF4 ~ sex, plotTab)
pval <- formatNum(corRes$p.value, digits = 1, format = "e")
annoP <- bquote(italic("P")~"="~.(pval))
plotTab <- group_by(plotTab, sex) %>% mutate(n=n()) %>% ungroup() %>%
mutate(sex = sprintf("%s\n(N=%s)",sex,n))
plotSex <- ggplot(plotTab, aes(x = sex, y = LF4)) +
geom_violin(aes(fill = sex)) +
geom_point() +
annotate("text", x = Inf, y = Inf, label = annoP,
hjust=1.2, vjust =2, size = 5, parse = FALSE, col= colList[1]) +
scale_fill_manual(values = colList) +
ylab("predicted CLL-PD") + xlab("Sex") +
theme_full + theme(legend.position = "none")
plotSex

Plot C1/C2
plotTab <- filter(plotTab, C1C2 != "CNA")
corRes <- t.test(LF4 ~ C1C2, plotTab)
pval <- formatNum(corRes$p.value, digits = 1, format = "e")
annoP <- bquote(italic("P")~"="~.(pval))
plotTab <- filter(plotTab, !is.na(C1C2)) %>%
group_by(C1C2) %>% mutate(n=n()) %>% ungroup() %>%
mutate(C1C2 = sprintf("%s\n(N=%s)",C1C2,n))
plotC1C2 <- ggplot(filter(plotTab,!is.na(C1C2)), aes(x = C1C2, y = LF4)) +
geom_violin(aes(fill = C1C2)) +
geom_point() +
annotate("text", x = Inf, y = Inf, label = annoP,
hjust=1.1, vjust =2, size = 5, parse = FALSE, col= colList[1]) +
scale_fill_manual(values = colList[3:length(colList)]) +
ylab("predicted CLL-PD") + xlab("C1/C2 group") +
theme_full + theme(legend.position = "none")
plotC1C2

Association with genomics
#t-test
y <- y.pred
geneICGC <- geneICGC[names(y),]
tRes <- apply(geneICGC, 2, function(x) {
res <- t.test(y ~ as.factor(x), var.equal = TRUE)
data.frame(p = res$p.value,
df = res$estimate[[2]] - res$estimate[[1]])
}) %>% bind_rows() %>% mutate(gene = colnames(geneICGC),
p.adj = p.adjust(p, method = "BH")) %>%
arrange(p)
filter(tRes, p.adj <0.05) %>% mutate_if(is.numeric, formatNum, digits =3, format = "e") %>% DT::datatable()
plotGeneVolcano <- plotVolcano(tRes, posCol = colList[1], negCol = colList[2],
x_lab = "Difference of mean", ifLabel = TRUE) +
theme(legend.position = "none",
plot.margin = margin(8,8,8,8),
axis.title.y = element_text(vjust=0))
plotGeneVolcano

Assocation with mutation load
plotTab <- tibble(value = y.pred,load=rnaExternal[,names(y.pred)]$mutationLoad) %>%
filter(!is.na(load))
corRes <- cor.test(plotTab$load, plotTab$value)
pval <- formatNum(corRes$p.value, digits = 1, format = "e")
annoN <- sprintf("N = %s", nrow(plotTab))
annoP <- bquote(italic("P")~"="~.(pval))
annoCoef <- sprintf("coefficient = %1.2f",corRes$estimate)
plotMut <- ggplot(plotTab, aes(x = load, y = value)) +
geom_point(color =colList[6]) +
geom_smooth(method = "lm", se=FALSE, color = "grey50", linetype ="dashed" ) +
annotate("text", x = max(plotTab$load), y = Inf, label = annoN,
hjust=1, vjust =1.5, size = 5, parse = FALSE, col= colList[1]) +
annotate("text", x = max(plotTab$load), y = Inf, label = annoP,
hjust=1, vjust =3.5, size = 5, parse = FALSE, col= colList[1]) +
annotate("text", x = max(plotTab$load), y = Inf, label = annoCoef,
hjust=1, vjust =5.5, size = 5, parse = FALSE, col= colList[1]) +
ylab("predicted CLL-PD") + xlab("Number of somatic mutations detected by WGS") +
theme_full
plotMut
